# datos estadisticos basicos, grabar en un archivo, biblioteca DT (problema 3)
# creado 2020-04-25  
# Autor: GAD
# ultima modificacion: 2021-04-16
# clase 2
####################################
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
# Load the class data set from a semicolon-delimited text file. The locale
# declares an empty grouping mark, and whitespace around fields is trimmed.
datos <- read_delim(
  file = "datos_clase2.csv",
  delim = ";",
  escape_double = FALSE,
  locale = locale(grouping_mark = ""),
  trim_ws = TRUE
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   pais = col_character(),
##   codigo = col_double(),
##   cantHabitantes = col_double(),
##   casos = col_double()
## )
# Example: save the data to a .csv file (write.csv2 uses ";" as the field
# separator and "," as the decimal mark).
# NOTE(review): the target path is the same file that read_delim() loaded
# above, so this call overwrites the input file -- confirm this is intended.
write.csv2(
  x = datos,
  file = "datos_clase2.csv",
  row.names = FALSE,
  fileEncoding = "UTF-8"
)

########################  resumen estadistico  basico 

# Column names of the data set.
names(datos)
## [1] "pais"           "codigo"         "cantHabitantes" "casos"
# Treat the country name as a categorical variable.
datos[["pais"]] <- as.factor(datos[["pais"]])

# Basic summary of the population column, then of every column at once.
summary(datos[["cantHabitantes"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 8.090e+02 2.163e+06 9.228e+06 4.077e+07 2.965e+07 1.439e+09
summary(datos)
##                   pais         codigo      cantHabitantes     
##  Afghanistan        :  1   Min.   :  4.0   Min.   :8.090e+02  
##  Albania            :  1   1st Qu.:203.8   1st Qu.:2.163e+06  
##  Algeria            :  1   Median :424.0   Median :9.228e+06  
##  Andorra            :  1   Mean   :427.2   Mean   :4.077e+07  
##  Angola             :  1   3rd Qu.:649.2   3rd Qu.:2.965e+07  
##  Antigua and Barbuda:  1   Max.   :954.0   Max.   :1.439e+09  
##  (Other)            :184   NA's   :2                          
##      casos         
##  Min.   :       1  
##  1st Qu.:    9650  
##  Median :   90470  
##  Mean   :  731978  
##  3rd Qu.:  338279  
##  Max.   :31495649  
## 
#######################  mejoramos
# podemos crear nuestra propia funcion usando summarise()

# Collapse the whole data frame into a single row of aggregate figures:
# population total / mean / min / max / standard deviation, number of
# distinct countries, mean and total cases, and cases per 100 inhabitants.
# No na.rm is passed, so a missing value in a column makes its aggregates NA.
dplyr::summarise(
  datos,
  poblacion_mundial = sum(cantHabitantes),
  avg_cantidad = mean(cantHabitantes),
  min_cantidad = min(cantHabitantes),
  max_cantidad = max(cantHabitantes),
  cant_paises = n_distinct(pais),
  avg_casos = mean(casos),
  casos_por_cien = 100 * sum(casos) / sum(cantHabitantes),
  sum_casos = sum(casos),
  ds = sd(cantHabitantes)
)
## # A tibble: 1 x 9
##   poblacion_mundial avg_cantidad min_cantidad max_cantidad cant_paises avg_casos
##               <dbl>        <dbl>        <dbl>        <dbl>       <int>     <dbl>
## 1        7746918455    40773255.          809   1439323774         190   731978.
## # ... with 3 more variables: casos_por_cien <dbl>, sum_casos <dbl>, ds <dbl>
#  completar con la funcion que uno quiera

############################### lo hacemos mejor
library(skimr)
library(dplyr)
# Use a 140-character console width for the printed output that follows.
options(width = 140)
# Make sure `pais` is a factor (as.factor() leaves an existing factor as is).
datos[["pais"]] <- as.factor(datos[["pais"]])

# Per-column overview computed by skimr, stored for later inspection.
res_skim <- skimr::skim(datos)

###############################
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
# Descriptive statistics computed by pastecs over the whole data frame.
res_stat <- pastecs::stat.desc(datos)

##########################

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Hmisc's per-variable description (counts, missing, distinct, quantiles).
# Namespace-qualified because psych also provides a describe() function.
Hmisc::describe(datos)
## datos 
## 
##  4  Variables      190  Observations
## --------------------------------------------------------------------------------------------------------------------------------------------
## pais 
##        n  missing distinct 
##      190        0      190 
## 
## lowest : Afghanistan        Albania            Algeria            Andorra            Angola            
## highest: Vietnam            West Bank and Gaza Yemen              Zambia             Zimbabwe          
## --------------------------------------------------------------------------------------------------------------------------------------------
## codigo 
##        n  missing distinct     Info     Mean      Gmd      .05      .10      .25      .50      .75      .90      .95 
##      188        2      188        1    427.2    296.9     41.4     71.4    203.8    424.0    649.2    771.6    831.2 
## 
## lowest :   4   8  12  20  24, highest: 862 882 887 894 954
## --------------------------------------------------------------------------------------------------------------------------------------------
## cantHabitantes 
##         n   missing  distinct      Info      Mean       Gmd       .05       .10       .25       .50       .75       .90       .95 
##       190         0       190         1  40773255  65334968    104013    397184   2163121   9227861  29653678  71198375 127827420 
## 
## lowest :        809      33938      38137      39244      53192, highest:  220892331  273523621  331002647 1380004385 1439323774
##                                                                                                                                         
## Value      0.00e+00 2.00e+07 4.00e+07 6.00e+07 8.00e+07 1.00e+08 1.20e+08 1.40e+08 1.60e+08 2.00e+08 2.20e+08 2.80e+08 3.40e+08 1.38e+09
## Frequency       100       43       18       10        4        3        3        1        1        1        2        1        1        1
## Proportion    0.526    0.226    0.095    0.053    0.021    0.016    0.016    0.005    0.005    0.005    0.011    0.005    0.005    0.005
##                    
## Value      1.44e+09
## Frequency         1
## Proportion    0.005
## 
## For the frequency table, variable is rounded to the nearest 20000000
## --------------------------------------------------------------------------------------------------------------------------------------------
## casos 
##         n   missing  distinct      Info      Mean       Gmd       .05       .10       .25       .50       .75       .90       .95 
##       190         0       189         1    731978   1254894     159.5    2019.6    9649.8   90470.0  338278.8 1565573.8 2899647.5 
## 
## lowest :        1        3        4       19       27, highest:  4622464  5248853 13746681 14291917 31495649
##                                                                                                                                         
## Value             0   500000  1000000  1500000  2000000  2500000  3000000  3500000  4000000  4500000  5000000 13500000 14500000 31500000
## Frequency       131       29        9        5        2        4        1        1        2        2        1        1        1        1
## Proportion    0.689    0.153    0.047    0.026    0.011    0.021    0.005    0.005    0.011    0.011    0.005    0.005    0.005    0.005
## 
## For the frequency table, variable is rounded to the nearest 500000
## --------------------------------------------------------------------------------------------------------------------------------------------
#######################
# psych's one-row-per-variable table (n, mean, sd, median, skew, ...).
# In the printed output the non-numeric column is flagged with "*" (pais*).
psych::describe(x = datos)
##                vars   n        mean           sd    median     trimmed         mad min        max      range skew kurtosis          se
## pais*             1 190       95.50        54.99      95.5       95.50       70.42   1        190        189 0.00    -1.22        3.99
## codigo            2 188      427.18       256.73     424.0      424.33      328.40   4        954        950 0.06    -1.18       18.72
## cantHabitantes    3 190 40773255.03 149341361.98 9227860.5 15452769.09 12526153.81 809 1439323774 1439322965 8.20    71.66 10834361.13
## casos             4 190   731978.34   2785521.73   90470.0   208550.74   130677.85   1   31495649   31495648 8.30    80.87   202082.99
############################### formato entregable 

####  solucion al problema 1 planteado en la clase 2
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.1.0     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks pastecs::extract()
## x dplyr::filter()    masks stats::filter()
## x pastecs::first()   masks dplyr::first()
## x dplyr::lag()       masks stats::lag()
## x pastecs::last()    masks dplyr::last()
## x Hmisc::src()       masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
library(DT)

# Render the data as an interactive DT table: per-column filters on top,
# export buttons (copy / csv / excel / pdf / print) from the Buttons
# extension, and a caption placed below the table.
DT::datatable(
  datos,
  extensions = "Buttons",
  filter = "top",
  class = "display nowrap compact",
  caption = htmltools::tags$caption(
    style = "caption-side: bottom; text-align: center;",
    "Table 1: ",
    # Caption typo fixed: "los confirmado" -> "los confirmados".
    htmltools::em("estadisticas simples sobre los confirmados")
  ),
  options = list(
    dom = "Blfrtip",
    buttons = c("copy", "csv", "excel", "pdf", "print"),
    # First vector: page lengths (-1 means all rows); second: their labels.
    lengthMenu = list(
      c(10, 25, 50, -1),
      c(10, 25, 50, "All")
    )
  )
)
# Exportarlo desde el R-studio

####
### vis
library(naniar)
## 
## Attaching package: 'naniar'
## The following object is masked from 'package:skimr':
## 
##     n_complete
# Plot which cells of the data frame are missing, column by column.
naniar::vis_miss(x = datos)